{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting category_encoders\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)\n",
      "\u001b[K     |████████████████████████████████| 102kB 3.1MB/s ta 0:00:01\n",
      "\u001b[?25hCollecting patsy>=0.4.1\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)\n",
      "\u001b[K     |████████████████████████████████| 235kB 8.4MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from category_encoders) (1.17.0)\n",
      "Collecting statsmodels>=0.6.1\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b5/b8/50f9b86bbd87b1de961f439c2b93dfc41dd0cb9d65f6b7d824b287b50b21/statsmodels-0.10.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (10.5MB)\n",
      "\u001b[K     |████████████████████████████████| 10.5MB 32.7MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: pandas>=0.21.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from category_encoders) (0.25.0)\n",
      "Requirement already satisfied: scipy>=0.19.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from category_encoders) (1.3.1)\n",
      "Requirement already satisfied: scikit-learn>=0.20.0 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from category_encoders) (0.21.3)\n",
      "Requirement already satisfied: six in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from patsy>=0.4.1->category_encoders) (1.12.0)\n",
      "Requirement already satisfied: python-dateutil>=2.6.1 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from pandas>=0.21.1->category_encoders) (2.8.0)\n",
      "Requirement already satisfied: pytz>=2017.2 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from pandas>=0.21.1->category_encoders) (2019.2)\n",
      "Requirement already satisfied: joblib>=0.11 in /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages (from scikit-learn>=0.20.0->category_encoders) (0.14.0)\n",
      "Installing collected packages: patsy, statsmodels, category-encoders\n",
      "Successfully installed category-encoders-2.1.0 patsy-0.5.1 statsmodels-0.10.1\n"
     ]
    }
   ],
   "source": [
    "!pip install category_encoders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from category_encoders import OrdinalEncoder\n",
    "import xgboost as xgb\n",
    "from xgboost.sklearn import XGBClassifier\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from bayes_opt import bayesian_optimization\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train dataset shape is: (9674, 15)\n",
      "test dataset shape is: (11182, 15)\n"
     ]
    }
   ],
   "source": [
    "# question 1: read adult data.txt as train, adult test.txt as test, rename their headers to ['age', \n",
    "# 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',\n",
    "# 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'wage_class']\n",
    "column_name = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', \n",
    "               'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', \n",
    "               'hours_per_week', 'native_country', 'wage_class']\n",
    "df_train = pd.read_csv('adult+data.txt', header = None)\n",
    "df_test = pd.read_csv('adult+test.txt', header = None)\n",
    "df_train.columns = column_name\n",
    "df_test.columns = column_name\n",
    "print(\"train dataset shape is: \" + str(df_train.shape))\n",
    "print(\"test dataset shape is: \" + str(df_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train dataset shape without '?' is: (8936, 15)\n",
      "test dataset shape without '?' is: (10347, 15)\n"
     ]
    }
   ],
   "source": [
    "# question 2: replace ‘ ?’ with nan and drop na\n",
    "df_train_nona = df_train.replace(' ?', np.nan).dropna()\n",
    "df_test_nona = df_test.replace(' ?', np.nan).dropna()\n",
    "print(\"train dataset shape without '?' is: \" + str(df_train_nona.shape))\n",
    "print(\"test dataset shape without '?' is: \" + str(df_test_nona.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# question 3: replace test wage_class all of the ‘<=50K.’ \n",
    "# with ‘<=50K’ and the same for ‘>50K.’ with ‘>50K’, same as train\n",
    "df_test_rep = df_test_nona.replace([' >50K.', ' <=50K.'], ['>50K', '<=50K'])\n",
    "df_train_rep = df_train_nona.replace([' >50K', ' <=50K'], ['>50K', '<=50K'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_object = df_train_rep.select_dtypes(include = 'object').columns\n",
    "df_train_final = df_train_rep\n",
    "for i in df_train_rep.columns:\n",
    "    if df_train_rep[i].dtype == 'object':\n",
    "        encoder = OrdinalEncoder()\n",
    "        df_train_final[i] = encoder.fit_transform(df_train_rep[i])\n",
    "for i in df_test_rep.columns:\n",
    "    if df_test_rep[i].dtype == 'object':\n",
    "        encoder = OrdinalEncoder()\n",
    "        df_test_final[i] = encoder.fit_transform(df_test_rep[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education_num</th>\n",
       "      <th>capital_gain</th>\n",
       "      <th>capital_loss</th>\n",
       "      <th>hours_per_week</th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital_status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>native_country</th>\n",
       "      <th>wage_class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>77516</td>\n",
       "      <td>13</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>83311</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>215646</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>234721</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>338409</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \\\n",
       "0   39   77516             13          2174             0              40   \n",
       "1   50   83311             13             0             0              13   \n",
       "2   38  215646              9             0             0              40   \n",
       "3   53  234721              7             0             0              40   \n",
       "4   28  338409             13             0             0              40   \n",
       "\n",
       "   workclass  education  marital_status  occupation  relationship  race  sex  \\\n",
       "0          1          1               1           1             1     1    1   \n",
       "1          2          1               2           2             2     1    1   \n",
       "2          3          2               3           3             1     1    1   \n",
       "3          3          3               2           3             2     2    1   \n",
       "4          3          1               2           4             3     2    2   \n",
       "\n",
       "   native_country  wage_class  \n",
       "0               1           1  \n",
       "1               1           1  \n",
       "2               1           1  \n",
       "3               1           1  \n",
       "4               2           1  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# question 4: apply Ordinal Encoding to Categoricals for both train and test\n",
    "# df_train_object = df_train_rep.select_dtypes(include = 'object')\n",
    "# df_test_object = df_test_rep.select_dtypes(include = 'object')\n",
    "# object_columns = df_test_object.columns\n",
    "# train_enc = OrdinalEncoder()\n",
    "# train_enc.fit(df_train_object)\n",
    "# train_fit = train_enc.transform(df_train_object)\n",
    "# df_train_fit = pd.DataFrame(train_fit, columns = object_columns).reset_index()\n",
    "# test_enc = OrdinalEncoder()\n",
    "# test_enc.fit(df_test_object)\n",
    "# test_fit = test_enc.transform(df_test_object)\n",
    "# df_test_fit = pd.DataFrame(test_fit, columns = object_columns).reset_index()\n",
    "# df_train_final = pd.merge(df_train_rep.select_dtypes(exclude = 'object').reset_index(), \n",
    "#                           df_train_fit, on = 'index').drop(columns = 'index')\n",
    "# df_test_final = pd.merge(df_test_rep.select_dtypes(exclude = 'object').reset_index(), \n",
    "#                          df_test_fit, on = 'index').drop(columns = 'index')\n",
    "# df_train_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       1\n",
       "1       1\n",
       "2       1\n",
       "3       1\n",
       "4       1\n",
       "       ..\n",
       "9668    1\n",
       "9669    1\n",
       "9670    1\n",
       "9671    1\n",
       "9672    2\n",
       "Name: wage_class, Length: 8936, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train = df_train_final['wage_class']\n",
    "y_test = df_test_final['wage_class']\n",
    "x_train = df_train_final.drop(columns = 'wage_class')\n",
    "x_test = df_test_final.drop(columns = 'wage_class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |\n",
      "-------------------------------------------------------------------------------------------------------------\n",
      "| \u001b[0m 1       \u001b[0m | \u001b[0m 0.9173  \u001b[0m | \u001b[0m 0.8197  \u001b[0m | \u001b[0m 0.5904  \u001b[0m | \u001b[0m 0.03409 \u001b[0m | \u001b[0m 3.965   \u001b[0m | \u001b[0m 3.862   \u001b[0m | \u001b[0m 1.679e+0\u001b[0m | \u001b[0m 0.8783  \u001b[0m |\n",
      "| \u001b[0m 2       \u001b[0m | \u001b[0m 0.909   \u001b[0m | \u001b[0m 0.8318  \u001b[0m | \u001b[0m 0.2127  \u001b[0m | \u001b[0m 0.08216 \u001b[0m | \u001b[0m 3.847   \u001b[0m | \u001b[0m 1.637   \u001b[0m | \u001b[0m 1.909e+0\u001b[0m | \u001b[0m 0.7945  \u001b[0m |\n",
      "| \u001b[0m 3       \u001b[0m | \u001b[0m 0.9136  \u001b[0m | \u001b[0m 0.7597  \u001b[0m | \u001b[0m 0.3406  \u001b[0m | \u001b[0m 0.03177 \u001b[0m | \u001b[0m 6.164   \u001b[0m | \u001b[0m 1.203   \u001b[0m | \u001b[0m 1e+03   \u001b[0m | \u001b[0m 0.7425  \u001b[0m |\n",
      "| \u001b[0m 4       \u001b[0m | \u001b[0m 0.9087  \u001b[0m | \u001b[0m 0.7246  \u001b[0m | \u001b[0m 0.5289  \u001b[0m | \u001b[0m 0.04529 \u001b[0m | \u001b[0m 6.931   \u001b[0m | \u001b[0m 1.029   \u001b[0m | \u001b[0m 1.385e+0\u001b[0m | \u001b[0m 0.8579  \u001b[0m |\n",
      "| \u001b[0m 5       \u001b[0m | \u001b[0m 0.906   \u001b[0m | \u001b[0m 0.7466  \u001b[0m | \u001b[0m 0.1452  \u001b[0m | \u001b[0m 0.04894 \u001b[0m | \u001b[0m 6.975   \u001b[0m | \u001b[0m 1.123   \u001b[0m | \u001b[0m 1.71e+03\u001b[0m | \u001b[0m 0.8905  \u001b[0m |\n",
      "| \u001b[0m 6       \u001b[0m | \u001b[0m 0.9091  \u001b[0m | \u001b[0m 0.7614  \u001b[0m | \u001b[0m 0.5287  \u001b[0m | \u001b[0m 0.08584 \u001b[0m | \u001b[0m 3.485   \u001b[0m | \u001b[0m 4.593   \u001b[0m | \u001b[0m 2e+03   \u001b[0m | \u001b[0m 0.7103  \u001b[0m |\n",
      "| \u001b[0m 7       \u001b[0m | \u001b[0m 0.9137  \u001b[0m | \u001b[0m 0.8103  \u001b[0m | \u001b[0m 0.9139  \u001b[0m | \u001b[0m 0.07961 \u001b[0m | \u001b[0m 3.001   \u001b[0m | \u001b[0m 4.831   \u001b[0m | \u001b[0m 1.168e+0\u001b[0m | \u001b[0m 0.7176  \u001b[0m |\n",
      "| \u001b[0m 8       \u001b[0m | \u001b[0m 0.9115  \u001b[0m | \u001b[0m 0.8721  \u001b[0m | \u001b[0m 0.05587 \u001b[0m | \u001b[0m 0.08241 \u001b[0m | \u001b[0m 3.033   \u001b[0m | \u001b[0m 4.954   \u001b[0m | \u001b[0m 1.536e+0\u001b[0m | \u001b[0m 0.7695  \u001b[0m |\n",
      "| \u001b[0m 9       \u001b[0m | \u001b[0m 0.9139  \u001b[0m | \u001b[0m 0.7057  \u001b[0m | \u001b[0m 0.5879  \u001b[0m | \u001b[0m 0.0661  \u001b[0m | \u001b[0m 3.036   \u001b[0m | \u001b[0m 4.987   \u001b[0m | \u001b[0m 1.308e+0\u001b[0m | \u001b[0m 0.7044  \u001b[0m |\n",
      "| \u001b[95m 10      \u001b[0m | \u001b[95m 0.9174  \u001b[0m | \u001b[95m 0.8318  \u001b[0m | \u001b[95m 0.9555  \u001b[0m | \u001b[95m 0.01823 \u001b[0m | \u001b[95m 3.006   \u001b[0m | \u001b[95m 4.982   \u001b[0m | \u001b[95m 1.777e+0\u001b[0m | \u001b[95m 0.7437  \u001b[0m |\n",
      "| \u001b[0m 11      \u001b[0m | \u001b[0m 0.9129  \u001b[0m | \u001b[0m 0.7005  \u001b[0m | \u001b[0m 0.809   \u001b[0m | \u001b[0m 0.01088 \u001b[0m | \u001b[0m 3.034   \u001b[0m | \u001b[0m 4.975   \u001b[0m | \u001b[0m 1.061e+0\u001b[0m | \u001b[0m 0.8288  \u001b[0m |\n",
      "| \u001b[0m 12      \u001b[0m | \u001b[0m 0.9139  \u001b[0m | \u001b[0m 0.8406  \u001b[0m | \u001b[0m 0.7304  \u001b[0m | \u001b[0m 0.08605 \u001b[0m | \u001b[0m 3.017   \u001b[0m | \u001b[0m 4.903   \u001b[0m | \u001b[0m 1.001e+0\u001b[0m | \u001b[0m 0.7405  \u001b[0m |\n",
      "=============================================================================================================\n",
      "-----------------------------------------------------\n",
      "Final Results\n",
      "XGBOOST: 0.917366\n"
     ]
    }
   ],
   "source": [
    "# question 5: build xgboost: wage_class is the response variable, and others are predictors\n",
    "# use bayesian to find the best combination of tunning parameters:\n",
    "# 'max_depth': (3,7) 'min_child_weight': (1,5), 'learning_rate': (0.01, 0.1), 'subsample': (0.7,0.9)\n",
    "# 'colsample_bytree': (0.7,0.9), 'n_estimators': (1000, 2000), ' 'gamma': (0.01,1.)\n",
    "def xgboostcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight, \n",
    "              subsample, colsample_bytree, silent = True, nthread = -1):\n",
    "    return cross_val_score(xgb.XGBClassifier(max_depth = int(max_depth), learning_rate = learning_rate, \n",
    "                                             n_estimators = int(n_estimators), silent = silent, nthread = nthread, \n",
    "                                             gamma = gamma, min_child_weight = min_child_weight, subsample = subsample, \n",
    "                                             colsample_bytree = colsample_bytree),\n",
    "                           x_train, \n",
    "                           y_train, \n",
    "                           scoring = 'roc_auc', \n",
    "                           cv=5).mean()\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    \n",
    "    xgboostBO = bayesian_optimization.BayesianOptimization(xgboostcv,\n",
    "                                 {'max_depth': (3, 7),\n",
    "                                  'learning_rate': (0.01, 0.1),\n",
    "                                  'n_estimators': (1000, 2000),\n",
    "                                  'gamma': (0.01, 1.),\n",
    "                                  'min_child_weight': (1, 5),\n",
    "                                  'subsample': (0.7, 0.9),\n",
    "                                  'colsample_bytree' :(0.7, 0.9)})\n",
    "    xgboostBO.maximize(init_points=2, n_iter = 10)\n",
    "    print('-'*53)\n",
    "    print('Final Results')\n",
    "    temp = xgboostBO.res\n",
    "    temp_list = []\n",
    "    for i in temp:\n",
    "        temp_list.append(i['target'])\n",
    "    print('XGBOOST: %f' % max(temp_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "ename": "XGBoostError",
     "evalue": "[21:15:00] src/objective/regression_obj.cu:101: label must be in [0,1] for logistic regression\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x0000000118939f09 dmlc::LogMessageFatal::~LogMessageFatal() + 57\n  [bt] (1) 2   libxgboost.dylib                    0x00000001189c8f18 xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*) + 1160\n  [bt] (2) 3   libxgboost.dylib                    0x00000001189361f2 xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) + 1330\n  [bt] (3) 4   libxgboost.dylib                    0x0000000118954684 XGBoosterUpdateOneIter + 180\n  [bt] (4) 5   _ctypes.cpython-37m-darwin.so       0x0000000106814177 ffi_call_unix64 + 79\n  [bt] (5) 6   ???                                 0x00007ffeeb7e6950 0x0 + 140732849350992\n\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mXGBoostError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-28-8a847f96d8a2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m              'objective': 'binary:logistic', 'max_depth':3.006, 'min_child_weight':4.982, 'gamma': 0.7437}\n\u001b[1;32m      5\u001b[0m df_cv = xgb.cv(params = param, dtrain = train_initial, num_boost_round = 1000, \n\u001b[0;32m----> 6\u001b[0;31m                nfold = 5, metrics = ['error'], seed = 0, early_stopping_rounds = 100)\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0mdf_cv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36mcv\u001b[0;34m(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks, shuffle)\u001b[0m\n\u001b[1;32m    443\u001b[0m                            evaluation_result_list=None))\n\u001b[1;32m    444\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mfold\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcvfolds\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 445\u001b[0;31m             \u001b[0mfold\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    446\u001b[0m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maggcv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcvfolds\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, iteration, fobj)\u001b[0m\n\u001b[1;32m    228\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    229\u001b[0m         \u001b[0;34m\"\"\"\"Update the boosters for one iteration\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    231\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    232\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[1;32m   1107\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1108\u001b[0m             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),\n\u001b[0;32m-> 1109\u001b[0;31m                                                     dtrain.handle))\n\u001b[0m\u001b[1;32m   1110\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1111\u001b[0m             \u001b[0mpred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m_check_call\u001b[0;34m(ret)\u001b[0m\n\u001b[1;32m    174\u001b[0m     \"\"\"\n\u001b[1;32m    175\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 176\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mXGBoostError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpy_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXGBGetLastError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    178\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mXGBoostError\u001b[0m: [21:15:00] src/objective/regression_obj.cu:101: label must be in [0,1] for logistic regression\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x0000000118939f09 dmlc::LogMessageFatal::~LogMessageFatal() + 57\n  [bt] (1) 2   libxgboost.dylib                    0x00000001189c8f18 xgboost::obj::RegLossObj<xgboost::obj::LogisticClassification>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*) + 1160\n  [bt] (2) 3   libxgboost.dylib                    0x00000001189361f2 xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*) + 1330\n  [bt] (3) 4   libxgboost.dylib                    0x0000000118954684 XGBoosterUpdateOneIter + 180\n  [bt] (4) 5   _ctypes.cpython-37m-darwin.so       0x0000000106814177 ffi_call_unix64 + 79\n  [bt] (5) 6   ???                                 0x00007ffeeb7e6950 0x0 + 140732849350992\n\n"
     ]
    }
   ],
   "source": [
    "# question 6: use xgb.cv to find the early stopping rounds based on error\n",
    "train_initial = xgb.DMatrix(x_train, y_train)\n",
    "param = {'eta': 0.01, 'seed':0, 'subsample': 0.7437, 'colsample_bytree': 0.8318, \n",
    "             'objective': 'binary:logistic', 'max_depth':3.006, 'min_child_weight':4.982, 'gamma': 0.7437}\n",
    "df_cv = xgb.cv(params = param, dtrain = train_initial, num_boost_round = 1000, \n",
    "               nfold = 5, metrics = ['error'], seed = 0, early_stopping_rounds = 100)\n",
    "df_cv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbQAAAD4CAYAAACE2RPlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deZxcZZ3v8c+XAIEECEuQGxFtlggSlgidSNhEyCCgsiiIipAEhgwOq164ckeMgRlHlBkXQGECgwnKAMMmDMgSgUAIENJZOgmERSFeBQZhgEjYAuF3/zhPwUmlqro6XV1VXf19v1796lPPec5zfnVS9I+z1O9RRGBmZtbXrdXoAMzMzGrBCc3MzFqCE5qZmbUEJzQzM2sJTmhmZtYS1m50AP3Z0KFDo62trdFhmJn1KXPnzn0pIjYvbndCa6C2tjY6OjoaHYaZWZ8i6Y+l2n3J0czMWoITmpmZtQQnNDMzawlOaGZm1hKc0MzMrCU4oZmZWUtwQjMzs5bghGZmZi3BX6xuoEXPLqPt7Nuq6rv0/M/1cjRmZn2bz9DMzKwlOKGZmVlLcEIzM7OW0CcTmqQPS7o+LY+UdEgV2+wn6dYax/FbSRvXckwzM1szfTKhRcRzEXFkejkS6DKh9VIch0TEq43Yt5mZraohCU3ScZIWSuqU9CtJX5A0W9J8Sb+TtEXqNzmtf0jSU5JOTO1tkhZLWhc4Dzha0gJJR0sanfrPl/SgpO2rjGlzSdMlPSrpckl/lDQ0rfuNpLlp3cTcNkslDU3xLJF0Wepzl6T1y+xnoqQOSR0r31jW00NpZmZJ3ROapBHAOcD+EbErcDrwALBHRHwSuAb4P7lNdgH2B8YAkyR9uLAiIlYAk4BrI2JkRFwLPA7sk8aaBPxzlaF9D7gnIkYA1wMfza07PiJ2B9qB0yRtVmL74cDP0/avAl8qtZOImBIR7RHRPmDQkCpDMzOzrjTie2j7A9dFxEsAEfGypJ2BayUNA9YFnsn1vzki3gTelHQvMBpYUGH8IcA0ScOBANapMq69gSNSTHdIeiW37jRJR6TlrciS1/8Ubf9MRBTimgu0VblfMzOrgWa5h3YRcHFE7Az8HbBebl0U9S1+XewfgXsjYifgC0VjdZuk/YCxwJh0Rjm/zJhv55ZX4i+tm5nVVSMS2j3AUYXLdpI2JTurejatH1fU/zBJ66X++wFzita/BmyYe50fa3w34poFfDnFdCCwSW68VyLiDUk7AHt0Y0wzM6uTuie0iHgU+D5wn6RO4MfAZOA6SXOBl4o2WQjcCzwM/GNEPFe0/l5gx8JDIcCPgB9Imk/3zpLOBQ6UtBg4CvhvsmR5B7C2pCXA+SkOMzNrMoro6gpe40iaDCyPiH+pw74GAisj4l1JY4BLImJkb+6zvb09Ojo6enMXZmYtR9LciGgvbvd9ng98FPhPSWsBK4ATGxyPmZl1Q1MntIiYXOsxJU0g+6pA3qyIOBn4ZK33Z2Zm9dHUlxxb3cBhw2PYuJ92eztPJWNm/Vm5S47N8ti+mZlZjzihmZlZS+jTCa03qu5LGi/p4lrGaWZmva9PJ7RmqbpvZmaN19CE1oxV94via5N0T4rxbkkfTe1Hpf12Sro/tY2Q9Eja/8JUS7LUmK62b2bWCxr22H6u6v6eEfFSKoEVZFX3Q9LfklXd/99pk13Iyk4NBuZLuq0wVkSskDQJaI+IU9L4G5FV3X9X0liyqvslK+BXcBEwLSKmSToeuBA4nKyK/2cj4ll9MMHnScDPIuKqlGAHlBowIqYAUyB7yrGb8ZiZWRmN/B5as1bdzxsDfDEt/4qsrBZkdR+nSvpP4MbU9hDwHUkfAW6MiKfWYH9mZraGmu0eWtNW3V9lxxEnkZ1dbgXMlbRZRPwHcCjwJvBbSfvXan9mZta1Ria0Zq26n/cg8JW0fAwwM8W6bUTMjohJwIvAVpK2AZ6OiAuBm8kukZqZWZ00LKE1cdX9vFOBCZIWAsfyQcmsCyQtSpX5HwQ6yaaeWSxpAbATcOUa7tPMzNZAnyh9Vc+q+/XkavtmZt3n0ldmZtbSmrrafkEDqu6bmVkf0ycuObYqV9s3M+s+X3I0M7OW5oRmZmYtocuEVqiXWI9g+jIfJzOzxmrIGZqkuj2MUs99mZlZ41Sb0AZIukzSo5LukrR+mn/s4VRZ/iZJmwBImiGpPS0PlbQ0LY+XdIuke4C7JQ2TdH/6IvRiSfuU27mk5ZJ+kvZ/t6TNU/u2ku6QNFfSTEk7pPapki6VNJsP6i8Wj7lI0sbK/I+k41L7lZL+RtIASRdImpPe49/ltj0r135uibG3SVX+R1V5fM3MrIeqTWjDgZ9HxAjgVbKq9VcC346IXYBFwPeqGGc34MiI+DTwNeDOiBgJ7ErlQsODgY60//ty+5oCnBoRuwNnAr/IbfMRskr+3yoz5ixgL2AE8DRQSKhjyKp/nAAsi4hRwCjgRElbSzowHY/RZHOw7S5p38KgaZqaG4DxEVFcnsvTx5iZ9ZJqL8c9ExGFhDMX2BbYOCLuS23TgOuqGGd6RLyclucAV0haB/hNbvxS3gOuTcu/Bm6UtAGwJ1mprEK/gbltrouIlRXGnAnsC/wRuASYKGlL4JWIeD0lrl0kFSYQHUKWyA5MP/NT+wap/f8Bm5PVcfxiRDxWaqeePsbMrHdUe4b2dm55JbBxuY7Au7lxiyvcv15YiIj7yRLKs2RTsRxXZSyQVdpfC3g1Ikbmfj5Ral9l3E92VrYPMIOsyPCRpALEgMjO/gpjbx0Rd6X2H+Tat4uIf0/bLCNLbHt3472YmVkNrOlDIcuAV3L3vY4luxQIsBTYPS0fSRmSPga8EBGXAZeTXY6sFGdhrK8BD0TEX4FnJB2VxpOkXat9AxHxJ2AoMDwingYeILtseX/qcifwjXQGiaSPSxqc2o9PZ4hI2lLSh9I2K4AjgOMkfa3aWMzMrOd68gTgOOBSSYPI7kFNSO3/AvynpInAbeU2JpsC5ixJ7wDLgUpnaK8DoyWdA/wFODq1HwNcktrXAa4hq3xfrdl8MLP0TOAHZIkNsiTbBsxTdk3zReDwiLhL0ieAh9KlzuXA18nOXEmXKz8PTJe0PCJu6UY8Zma2hvpE6auUGDZodBy15mr7Zmbd59JXZmbW0prqS8fpe2MDi5qP7cnZmavqm5n1D33ikmOrcrV9M7Pu8yVHMzNraU5oZmbWEpzQzMysJdQtoUk6XNKOudfnSRpbr/33VHH8ZmbWXOp5hnY48H5CiIhJEfG7Ou6/p1aJP89T1JiZNd4aJ7Q0oeWSEtPKnJimVumUdIOkQZL2BA4FLkjTxWybpng5UtJBkq7LjbufpFvT8oGSHpI0T9J1hXJTZeIZJenBtN9HJG0oaT1Jv0xTxcyX9JnUd7yki3Pb3ippv7S8XNL30zgPS9qiTPwzJP1UUgfwHUnP5MpkbZR/XRSnq+2bmfWCnp6hlZpW5saIGBURuwJLgBMi4kHgFuCsVND3D7kxfgd8KtVJhKys1TWShgLnAGMjYjegAyg5FYykdcmq8Z+e9jsWeBM4GYiI2Bn4KjBNUnHB5GKDgYfTOPcDJ1aIf92IaI+Ic8kKHBeep/9KOg7vFA8eEVPSNu0DBg3pIhQzM6tWTxNa8bQybcBOyibbXERWa3FEpQEi4l3gDuAL6dLd58imYNmD7BLfLEkLyGpHfqzMMNsDzxfmH4uIv6Zx9yabboaIeJxsqpiPd/GeVgC3Fr2ncq7NLV/OB/UsJwC/7GI/ZmZWQz2991M8rcz6wFSyIr6dksaTFSHuyjXAKcDLZBN5vpYKAk+PiK/2MMZS8lPcwKrT3LwTH3zbfCWVj1F+OpxZ6TLsfsCAiFhcq2DNzKxrvfFQyIbA8+n+0TG59tfSulLuI5s+5kSy5AbwMLCXpO0AJA2WVO7s6glgmKRRqe+G6WxvZiGGtO1HU9+lwEhJa0naimz26a5Uir/gSuA/8NmZmVnd9UZC+y7ZtCyzgMdz7deQTRczX9K2+Q3SzNK3Agen30TEi8B44GpJC4GHgB1K7TAiVpDde7tIUicwneys6xfAWuny57XA+Ih4O8X2DPAYcCEwr4r3VTb+nKuATYCrqxjPzMxqyLUca0jSkcBhEXFsNf09fYyZWfeVq+Xo70/ViKSLyM4wD2l0LGZm/VGfS2iSbgK2Lmr+dkTc2Yh4CiLi1Ebu38ysv+tzCS0ijmh0DLWy6NlltJ19W4/G8FQyZmYZFyc2M7OW4IRmZmYtwQnNzMxaghNaBZJ+I2luKr48MbWdIOnJVAD5skKRY0mbp2LMc9LPXo2N3sysf+lzD4XU2fER8bKk9YE5km4j++L4bmSVQ+4BOlPfnwE/iYgHJH0UuBP4RPGAKTFOBBiw0eZ1eAtmZv2DE1plp0kqPFW5FXAscF9EvAyQpr0plOMaC+yYlaAEYCNJG0TE8vyAETEFmAIwcNhwf6vdzKxGnNDKSEWGxwJjIuINSTPISnmtdtaVrAXsERFv1SdCMzPL8z208oYAr6RktgPZdDaDgU9L2iQVP/5Srv9dwPtfrpY0sq7Rmpn1c05o5d0BrC1pCXA+WfX/Z4F/Bh4hK3C8FChMO30a0C5poaTHgJPqHrGZWT/mS45lpKr8Bxe3S+qIiCnpDO0m4Dep/0tkFf/NzKwBnNC6b7KksWTT09xFSmhrYucth9Dh0lVmZjXhhNZNEXFmo2MwM7PV+R6amZm1BJ+hNVBPq+270r6Z2Qd8hmZmZi3BCc3MzFpCzROapPGFgr01HPNwSTvmXp+XnjQ0MzMD+s4Z2uHA+wktIiZFxO8aGI+ZmTWZbic0SV9PU6cskPRvkgZImlCYUgXYK9d3qqQjc6+X55a/LWmRpE5J56e2E9PUK51pKpZBkvYEDgUuSPvcNj+upAMkzU9jXSFpYGpfKulcSfPSuh0qvKfJadsZkp6WdFpqb5O0ONfvTEmT0/IMST+R1CFpiaRRkm6U9JSkf+rucTUzs57pVkKT9Amyahh7RcRIYCXwdeBcskS2N7kzqQrjHAwcBnwqInYFfpRW3RgRo1LbEuCEiHgQuAU4KyJGRsQfcuOsB0wFjo6Incme2vxGblcvRcRuwCVAV98f2wH4LDAa+J6kdbp6H8CKiGgHLgVuBk4GdgLGS9qszHufmJJgx8o3lpXqYmZma6C7Z2gHALuTzQ22IL3+JjAjIl6MiBXAtVWMMxb4ZUS8AVCYjgXYSdJMSYuAY4ARXYyzPfBMRDyZXk8D9s2tvzH9ngu0dTHWbRHxdiph9Rdgiyrexy3p9yLg0Yh4PpXMeppsupnVRMSUiGiPiPYBg4ZUsQszM6tGdxOagGnpTGlkRGwPTK7Q/93CPiStBazbxfhTgVPS2da5ZOWleuLt9HslXX/n7u3ccqH/+/EnxfEUtnmvaPv3qtifmZnVUHcT2t3AkZI+BCBpU2A+2ZQqm6XLdEfl+i8lO6OD7D5Y4TLedGCCpEG5cQA2BJ5P4xyTG+e1tK7YE0CbpO3S62OB+7r5nip5AfhQem8Dgc/XcGwzM6uhbiW0iHgMOAe4S9JCssQ0jOws7SGyKVWW5Da5jCzZdQJjgNfTOHeQXa7rSJcuC/e3vgvMTuM8nhvnGuCs9PDHtrl43gImANely5Tvkd3PqomIeAc4j2y6mOlFMZmZWRNRRDQ6hn6rvb09Ojo6Gh2GmVmfImlueiBvFX3le2hmZmYV9asHFyRNAE4vap4VESc3Ih4zM6sdX3JsoIHDhsewcT+t2Xiuvm9m/YEvOZqZWUtzQjMzs5bghGZmZi3BCa0MZXx8zMz6CP/BzknV9Z+QdCWwGPj3VEj4UUnn5vqNkvRgmhXgEUkbplkHLkizBSyU9HeNeydmZv1Pv3psv0rDgXER8bCkTSPiZUkDgLsl7UJWLeRasgr/cyRtBLwJnAAsi4hRqUzWLEl3RcQz+cElTQQmAgzYaPN6vi8zs5bmhLa6P0bEw2n5yykBrU1W4mtHIIDnI2IOQET8FUDSgcAuufnfhpAlx1USWkRMAaZA9th+L78XM7N+wwltda8DSNqarMbkqIh4RdJUKlf/F3BqRNzZ+yGamVkx30MrbyOy5LZM0hbAwan9CWCYpFEA6f7Z2sCdwDcKE4NK+rikwQ2I28ysX/IZWhkR0SlpPtk9sz+RzQBARKyQdDRwkaT1ye6fjQUuJ5tEdJ4kAS8ChzcidjOz/sgJLScilgI75V6PL9NvDrBHiVX/kH7MzKzOnNAaaOcth9Dh+otmZjXhe2hmZtYSnNDMzKwl+JJjAy16dhltZ99Wt/15ehkza2U+QzMzs5bQLxKapKWShlbZd7KkM3s7JjMzq62WT2ipDqOZmbW4pk5oks6SdFpa/omke9Ly/pKukvRVSYskLZb0w9x2yyX9q6ROYEyufX1Jt0s6Mb0+LlXG75T0qxL7PzFVz++UdIOkQan9qLTPTkn3p7YRqfL+gjTm8F49OGZmtoqmTmjATGCftNwObJBKS+0DPAn8ENgfGAmMklSozDEYmB0Ru0bEA6ltA+C/gKsj4jJJI4BzgP0jYlfg9BL7vzEiRqX1S8gq6gNMAj6b2g9NbScBP4uIkSnWP5d6Q5ImpilpOla+sazbB8TMzEpr9oQ2F9g9TdHyNvAQWbLYB3gVmBERL0bEu8BVwL5pu5XADUVj3Qz8MiKuTK/3B66LiJcAIuLlEvvfSdJMSYuAY4ARqX0WMDWd6RUuaT4E/IOkbwMfi4g3S72hiJgSEe0R0T5g0JDqj4SZmVXU1AktIt4hm35lPPAg2RnbZ4DtgKUVNn0rIlYWtc0CDkp1Fqs1FTglInYGziVV24+Ik8jO7rYC5kraLCL+g+xs7U3gt5L278Z+zMysh5o6oSUzyaZxuT8tnwTMBx4BPi1paHrw46vAfRXGmQS8Avw8vb4HOErSZgCSNi2xzYbA8+ky5zGFRknbRsTsiJhEVoR4K0nbAE9HxIVkZ4O7rOkbNjOz7usrCW0Y8FBEvAC8BcyMiOeBs4F7gU5gbkTc3MVYpwPrS/pRRDwKfB+4Lz088uMS/b8LzCY7u3s8135B4WEUsjPHTuDLwGJJC8gKHF9ZPJiZmfUeRXjS5EYZOGx4DBv307rtz5VCzKwVSJobEe3F7S591UCutm9mVjt94ZKjmZlZl5zQzMysJfiSYwPVu9p+Pfg+nZk1is/QzMysJTihmZlZS3BCMzOzluCEVoGkwZJuS1X1F0s6WtLuku6TNFfSnZKGSVo7VeXfL233A0nfb3D4Zmb9ih8Kqewg4LmI+ByApCHA7cBhEfGipKOB70fE8ZLGA9dLOjVt96lGBW1m1h85oVW2CPjXNNfarWS1IHcCpqcaxwOA5wEi4tE0p9qtwJiIWFFqQEkTgYkAAzbavNffgJlZf+GEVkFEPClpN+AQ4J/ICho/GhFjymyyM9m0Nh+qMOYUYApkpa9qG7GZWf/le2gVSPow8EZE/Bq4gOwy4uaSxqT166SJQpH0RWBTsjnZLpK0cYPCNjPrl3yGVtnOZJX13wPeAb4BvAtcmO6nrQ38VNILwPnAARHxJ0kXAz8DxjUobjOzfscJrYKIuBO4s8SqfUu0fTy33YW9FpSZmZXkhNZArrZvZlY7vodmZmYtwQnNzMxagi85NlAtqu27ur2ZWcZnaGZm1hKc0MzMrCX0m4Qm6TRJSyRdVaHP8hrsZ3z6QraZmdVRf7qH9vfA2Ij4cy/vZzywGHiul/djZmY5/eIMTdKlwDbA7ZKWSbpC0gxJT0s6rUT/n0s6NC3fJOmKtHx8YVoYSd+V9ISkByRdLelMSUcC7cBVkhZIWr9+79LMrH/rFwktIk4iO2P6DPATYAfgs8Bo4HuS1inaZCawT1reEtgxLe8D3C9pFPAlYFfgYLIkRkRcD3QAx0TEyIh4szgWSRMldUjqWPnGshq+SzOz/q1fJLQSbouItyPiJeAvwBZF62cC+0jaEXgMeEHSMGAM8CCwF3BzRLwVEa8B/1XtjiNiSkS0R0T7gEFDavJmzMysf91Dy3s7t7ySouMQEc+mavkHAfeTVdH/MrA8Il5Lc6GZmVkT6a9naNV4GDiDLKHNBM5MvwFmAV+QtJ6kDYDP57Z7DdiwnoGamZkTWiUzgbUj4vfAPLKztJkAETEHuAVYCNxONrN14YbYVOBSPxRiZlZfivCkyWtC0gYRsVzSILKzuIkRMa87YwwcNjyGjftpj+Jw6Ssz628kzY2I9uL2/noPrRampIdG1gOmdTeZgaePMTOrJSe0NRQRX2t0DGZm9gEntAaqRbV9W5UvwZr1X34oxMzMWoITmpmZtQQntBIkjZR0SO71oZLObmRMZmZWmRNaaSOB9xNaRNwSEec3MB4zM+tCUyU0Sd+StDj9nJHajpO0UFKnpF+lti1SFfzO9LOnpDZJi3NjnSlpclqeIeln6cvOiyWNTu2jJT0kab6kByVtL2ld4Dzg6NT/6DTH2cVpmzZJ96SY7pb00dQ+VdKFaZynU+V9MzOrk6Z5ylHS7sAE4FOAgNmS5gDnAHtGxEuSNk3dLwTui4gjJA0ANgA26WIXgyJipKR9gSuAnYDHgX0i4l1JY4F/jogvSZoEtEfEKSm28blxLiL73tk0ScenWA5P64YBe5NV878FuL7E+5wITAQYsNHm1R4eMzPrQtMkNLJEcFNEvA4g6UayaVmuS1XxiYiXU9/9geNS20pgmaSuEtrVqf/9kjZKxYc3BKZJGg4EUDyNTCljgC+m5V8BP8qt+01EvAc8Jqm4gj9p/1OAKZBVCqlif2ZmVoWmuuTYQ++y6vtZr2h9cfII4B+BeyNiJ+ALJbbprnwVf5fkNzOro2ZKaDOBwyUNkjQYOIJsssyjJG0GkLvkeDfwjdQ2QNIQ4AXgQ5I2kzSQVSvgAxyd+u8NLIuIZcAQ4Nm0fnyub6WK+Q8CX0nLx/BBBX4zM2ugpkloqRbiVOARYDZweUTMAr4P3CepE/hx6n468BlJi4C5wI4R8Q7ZwxyPANPJ7o/lvSVpPnApcEJq+xHwg9Sev/x6L7Bj4aGQonFOBSZIWggcm2IxM7MG6xfV9iXNAM6MiI5Gx5JXi2r7tiqXvjJrfa6234Rcbd/MrHb6RUKLiP0aHYOZmfWuprmHZmZm1hP94gytWbXi9DG+h2VmjeIzNDMzawlOaGZm1hKaLqFJ+rCk69PyKtO4VNhmP0m3dnM/bZK+Vqt+ZmbWWE2V0CStHRHPRUShUv0q07jUWBtQTaKqtp+ZmTVQTRJaOot5PE2h8qSkqySNlTRL0lNpmpbVpmpJ246XdIuke4C7C9PAlJnGpeQYVcT36TTGgrTthsD5wD6p7ZtpvzMlzUs/e6bNi/u9P5VMGvvWdIY4IL3/xZIWSfpmLY6tmZlVp5ZPOW4HHAUcD8whO6vZGzgU+Aey6virTNUCfCltuxuwS0S8LKkNICJWlJjGZaMKY1RyJnByRMyStAHwFnA2WfWQz6exBwF/ExFvper7V5NV+y/uN77MPkYCW6ZCx6Rq/qvx9DFmZr2jlgntmYhYBCDpUeDuiIhUb7GNrBBwualapuemhqmk0hiVzAJ+LOkq4MaI+LO0WjH8dYCLJY0EVgIfr3LsgqeBbSRdBNwG3FWqk6ePMTPrHbW8h5afOuW93Ov3yBJnpalaXq9yH2s03UtEnA/8LbA+MEvSDiW6fZOsYv+uZGdm65YZruQ0NRHxStp2BnAScHk1sZmZWW3U84vV5aZqqaR4Gpc1GQNJ26azx0WSRpHNKP2nEmP/OSLekzQOGFAmhqXA30taC9gSGJ32MRRYERE3SHoC+HW18ZmZWc/V8ynHclO1VFI8jcuajAFwRnpYYyHwDnA7sBBYKakzPcDxC2BcmqZmBz44ayzuNwt4BngMuBCYl/ptCcyQtIAsmf3fbsRnZmY91C+mj2lW7e3t0dHRVDPamJk1vXLTxzTV99DMzMzWVEsVJ5Y0gdVnkJ4VESc3Ih4zM6sfX3JsIM9YvTpX6zezrviSo5mZtTQnNDMzawl9PqEV11as0ZiHS9ox9/q8VGrLzMyaVJ9PaL3kcOD9hBYRkyLidw2Mx8zMutD0CU3S1yU9kr5c/W+pqv2EVNX/EWCvXN+pko7MvV6eW/52qoLfKen81HaipDmp7QZJg1KV/UOBC9I+t82PK+mAVLF/kaQrJA1M7UslnZsq9S8qU17LzMx6SVMnNEmfAI4G9oqIQtHgrwPnkiWyvcmdSVUY52DgMOBTEbErWcURyAoVj0ptS4ATIuJB4BbgrIgYGRF/yI2zHjAVODoidib72sM3crt6KSJ2Ay4hq/BfKpaJkjokdax8Y1m1h8LMzLrQ1AkNOADYHZiTSkodQFZEeEZEvBgRK4BrqxhnLPDLiHgDIFfZf6c0B9oi4BhgRBfjbE82q8CT6fU0YN/c+hvT77lkMwysJiKmRER7RLQPGDSkitDNzKwazZ7QBExLZ0ojI2J7YHKF/u9Xwk/Fg8tVzC+YCpySzrbOpcrq/RUUZhhYSYt9ad3MrNk1e0K7GzhS0ocAJG0KzAc+LWkzSeuQTSpasJTsjA6y+2CF+dKmAxPSJJ6FcSCrov98GueY3DjFFfYLngDaJG2XXh8L3Lfmb8/MzGqlqRNaRDwGnAPclSrlTweGkZ2lPURW+X5JbpPLyJJdJzCGVDE/Iu4guy/WkS5dFu5vfReYncZ5PDfONcBZ6eGPbXPxvAVMAK5LlynfAy6t5Xs2M7M149JXDeTSV6tz6Ssz60q50le+z9NAO285hA7/ATczq4mmvuRoZmZWLZ+hNdCiZ5fRdvZtjQ7DzKyueuvWgs/QzMysJTihmZlZS2j6hCZphqTVnmYp6nNG4Ttm6fVvJW1cwxgmSypXyurBWu3HzMzWXFMkNGV6EssZwPsJLSIOiYhXex5Z1yJiz3rsx8zMKmtYQpPUJukJSVcCi4FjJT2UqtVfJ2mDEttckgr7Pirp3NR2GvBh4F5J96a2pZKGpuVvSVqcfs7I7XuJpMvSWHdJWr8wnqTHJC2UdE1u9zums8Wn0z4LMS1Pv/eTdL+k29L7urSHSdrMzLqh0X9whwO/AD4NnACMTdXqO4Bvlej/nfRlul3IKoLsEhEXAvYFCaAAAAcZSURBVM8Bn4mIz+Q7S9qdrLLHp4A9gBMlfTK3759HxAjgVeBLqf1s4JMRsQtwUm64HYDPAqOB76VyWcVGA6eSzQCwLfDF4g6utm9m1jsandD+GBEPkyWbHYFZqTTVOOBjJfp/WdI8snqOI+h66pi9gZsi4vWIWE5WDX+ftO6ZiFiQlvPV8RcCV0n6Olmx44LbIuLtiHgJ+AuwRYn9PRIRT0fESuDqtP9VuNq+mVnvaPT30F5PvwVMj4ivlusoaWuyGoyjIuIVSVPpWXX8t3PLK4H10/LnyKaE+QLwHUk7l+lf6tgV1xFzXTEzszpp9BlawcPAXoUq9pIGS/p4UZ+NyBLgMklbAAfn1pWrjj8TODzNRD0YOCK1lZTueW0VEfcC3waGAKvdy6tgtKSt0zhHAw90Y1szM+uBRp+hARARL0oaD1wtaWBqPgd4MtenU9J8sqr4fyKrkF8wBbhD0nP5+2gRMS+dyT2Smi6PiPmS2sqEMgD4taQhZGeNF0bEq5KqfStzgIuB7YB7gZuq3dDMzHrG1fZrRNJ+wJkR8flqt3G1fTPrj3pa+srV9puQq+2bmdWOE1qNRMQMYEaDwzAz67ea5aEQMzOzHnFCMzOzluCEZmZmLcEJzczMWoITmpmZtQQnNDMzawlOaGZm1hKc0MzMrCW49FUDSXoNeKLRcXTDUOClRgfRTX0t5r4WL/S9mPtavND3Yu7teD8WEZsXN7pSSGM9UaoeWbOS1NGX4oW+F3Nfixf6Xsx9LV7oezE3Kl5fcjQzs5bghGZmZi3BCa2xpjQ6gG7qa/FC34u5r8ULfS/mvhYv9L2YGxKvHwoxM7OW4DM0MzNrCU5oZmbWEpzQ6kDSQZKekPR7SWeXWD9Q0rVp/WxJbfWP8v1YtpJ0r6THJD0q6fQSffaTtEzSgvQzqRGxFsW0VNKiFE9HifWSdGE6xgsl7daIOFMs2+eO3QJJf5V0RlGfhh9jSVdI+oukxbm2TSVNl/RU+r1JmW3HpT5PSRrXwHgvkPR4+je/SdLGZbat+Pmpc8yTJT2b+7c/pMy2Ff+u1DHea3OxLpW0oMy2vX+MI8I/vfgDDAD+AGwDrAt0AjsW9fl74NK0/BXg2gbGOwzYLS1vCDxZIt79gFsbfWyLYloKDK2w/hDgdkDAHsDsRsec+3z8N9kXRZvqGAP7ArsBi3NtPwLOTstnAz8ssd2mwNPp9yZpeZMGxXsgsHZa/mGpeKv5/NQ55snAmVV8bir+XalXvEXr/xWY1Khj7DO03jca+H1EPB0RK4BrgMOK+hwGTEvL1wMHSFIdY3xfRDwfEfPS8mvAEmDLRsRSY4cBV0bmYWBjScMaHRRwAPCHiPhjowMpFhH3Ay8XNec/q9OAw0ts+llgekS8HBGvANOBg3ot0KRUvBFxV0S8m14+DHykt+PojjLHuBrV/F2puUrxpr9ZXwau7u04ynFC631bAn/Kvf4zqyeI9/uk//iWAZvVJboK0qXPTwKzS6weI6lT0u2SRtQ1sNICuEvSXEkTS6yv5t+hEb5C+T8AzXaMAbaIiOfT8n8DW5To06zH+niys/RSuvr81Nsp6TLpFWUu6zbjMd4HeCEiniqzvtePsROalSRpA+AG4IyI+GvR6nlkl8h2BS4CflPv+ErYOyJ2Aw4GTpa0b6MD6oqkdYFDgetKrG7GY7yKyK4j9Ynv/Uj6DvAucFWZLs30+bkE2BYYCTxPdhmvL/gqlc/Oev0YO6H1vmeBrXKvP5LaSvaRtDYwBPifukRXgqR1yJLZVRFxY/H6iPhrRCxPy78F1pE0tM5hFsf0bPr9F+AmsksyedX8O9TbwcC8iHiheEUzHuPkhcKl2vT7LyX6NNWxljQe+DxwTErCq6ni81M3EfFCRKyMiPeAy8rE0mzHeG3gi8C15frU4xg7ofW+OcBwSVun/yP/CnBLUZ9bgMKTYEcC95T7D6+3pevg/w4siYgfl+nzvwr3+CSNJvscNTIBD5a0YWGZ7EGAxUXdbgGOS0877gEsy106a5Sy/0fbbMc4J/9ZHQfcXKLPncCBkjZJl8sOTG11J+kg4P8Ah0bEG2X6VPP5qZuie7tHlImlmr8r9TQWeDwi/lxqZd2OcW8/FeOf95+we5LsqaTvpLbzyP4jA1iP7LLT74FHgG0aGOveZJeRFgIL0s8hwEnASanPKcCjZE9WPQzs2eDju02KpTPFVTjG+ZgF/Dz9GywC2hsc82CyBDUk19ZUx5gs2T4PvEN2j+YEsnu7dwNPAb8DNk1924HLc9senz7PvwcmNDDe35Pdayp8lgtPE38Y+G2lz08DY/5V+owuJEtSw4pjTq9X+7vSiHhT+9TCZzfXt+7H2KWvzMysJfiSo5mZtQQnNDMzawlOaGZm1hKc0MzMrCU4oZmZWUtwQjMzs5bghGZmZi3h/wNUKkZqgqGteQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# question 7: plot feature importance\n",
    "feature_imp = xgb.train(param, train_initial, num_boost_round = 1000)\n",
    "imp_dict = feature_imp.get_score(importance_type = 'gain')\n",
    "X = np.arange(len(imp_dict))\n",
    "plt.barh(np.arange(len(imp_dict)), imp_dict.values(), align='center')\n",
    "plt.yticks(X, imp_dict.keys())\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n",
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/xgboost/core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  data.base is not None and isinstance(data, np.ndarray) \\\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7076049511078455"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# question 8: analyze auc score on Test Data,\n",
    "test_initial = xgb.DMatrix(x_test, y_test)\n",
    "predict_result = feature_imp.predict(test_initial)\n",
    "roc_auc_score(y_test, predict_result.round())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
